Loading packages:

library(Seurat)
library(tidyverse)
library(gprofiler2)
library(sleepwalk)
library(SCINA)
library(gridExtra)
theme_set(theme_bw(base_size = 14))

#Loading data

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024/Processed data to be loaded in R")

data_dir_1 <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024/Processed data to be loaded in R/de_novo_AML1'
data_dir_2 <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024/Processed data to be loaded in R/de_novo_AML2'
data_dir_3 <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024/Processed data to be loaded in R/de_novo_AML3'

AML_1 <- Read10X(
  data_dir_1,
  gene.column = 2,
  cell.column = 1,
  unique.features = TRUE,
  strip.suffix = FALSE
)

AML_2 <- Read10X(
  data_dir_2,
  gene.column = 2,
  cell.column = 1,
  unique.features = TRUE,
  strip.suffix = FALSE
)

AML_3 <- Read10X(
  data_dir_3,
  gene.column = 2,
  cell.column = 1,
  unique.features = TRUE,
  strip.suffix = FALSE
)
seurat_object_AML_1 = CreateSeuratObject(AML_1)
seurat_object_AML_2 = CreateSeuratObject(AML_2)
seurat_object_AML_3 = CreateSeuratObject(AML_3)
seurat_object_AML_1$orig.ident = 'AML_1'
seurat_object_AML_2$orig.ident = 'AML_2'
seurat_object_AML_3$orig.ident = 'AML_3'

denovo_AML_data <- merge(seurat_object_AML_1, y = c(seurat_object_AML_2, seurat_object_AML_3), project = "AML_merged")
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
as_tibble(denovo_AML_data@meta.data) %>%
  group_by(orig.ident) %>%
  summarise(initial_cells=n()) -> cell_counts

cell_counts
PercentageFeatureSet(denovo_AML_data,pattern="^MT-") -> denovo_AML_data$percent_mt
PercentageFeatureSet(denovo_AML_data,pattern="MALAT1") -> denovo_AML_data$percent_Malat
apply(denovo_AML_data@assays$RNA@counts, 2, function(x) max((100*x)/sum(x))) -> denovo_AML_data$percent_Largest

denovo_AML_data[[]]
as_tibble(denovo_AML_data@meta.data) -> qc_metrics
head(qc_metrics)

#QC

qc_metrics %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=value)) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot linear scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)

qc_metrics %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=log(value))) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot log scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)
qc_metrics %>%
  ggplot(aes(x=orig.ident, y=log(nFeature_RNA))) +
  geom_violin(fill="lightskyblue")+
                geom_hline(yintercept = log(1500))+
                geom_hline(yintercept = log(8800))

ggsave(paste0("Violin plot log scale yintercept 1000 to 8800 feature "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)

qc_metrics %>%
  ggplot(aes(x=orig.ident, y=log(percent_mt))) +
  geom_violin(fill="lightskyblue")+
                geom_hline(yintercept = log(20))
## Warning: Removed 37 rows containing non-finite values (`stat_ydensity()`).

ggsave(paste0("Violin plot log scale yintercept 15 percent_mt "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
## Warning: Removed 37 rows containing non-finite values (`stat_ydensity()`).
qc_metrics %>%
  ggplot(aes(x=orig.ident, y=log(percent_Largest))) +
  geom_violin(fill="lightskyblue")+
                geom_hline(yintercept = log(15))

ggsave(paste0("Violin plot log scale yintercept 20 percent_largest_gene "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)

qc_metrics %>%
  ggplot(aes(x=orig.ident, y=log(percent_Malat))) +
  geom_violin(fill="lightskyblue")+
                geom_hline(yintercept = log(20))
## Warning: Removed 179 rows containing non-finite values (`stat_ydensity()`).

ggsave(paste0("Violin plot log scale yintercept 20 percent_Malat "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
## Warning: Removed 179 rows containing non-finite values (`stat_ydensity()`).
subset(
  denovo_AML_data,
  nFeature_RNA > 1500 & 
  nFeature_RNA < 8800 &
  percent_mt < 20 & 
  percent_Malat < 20 
) -> denovo_AML_data

denovo_AML_data
## An object of class Seurat 
## 36601 features across 12599 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
as_tibble(denovo_AML_data@meta.data) -> qc_metrics_filtered

qc_metrics_filtered %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=value)) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot post filtering linear scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)


qc_metrics_filtered %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=log(value))) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot post filtering log scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

saveRDS(denovo_AML_data, file="denovo_AML_data_filtered_latest.RDS")
saveRDS(qc_metrics_filtered, file="denovoAML_qc_metrics_filtered_latest.RDS")

#Normalisation

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

#denovo_AML_data <- readRDS("denovo_AML_data_filtered_latest.RDS")
#qc_metrics_filtered <- readRDS("denovoAML_qc_metrics_filtered_latest.RDS")

NormalizeData(denovo_AML_data, normalization.method = "CLR") -> denovo_AML_data

saveRDS(denovo_AML_data, file="denovo_AML_data_norm_filtered_latest.RDS")

Post norm check

Then we can check how well the normalisation actually works

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

#denovo_AML_data <- readRDS("denovo_AML_data_norm_filtered_latest.RDS")
#qc_metrics_filtered <- readRDS("denovoAML_qc_metrics_filtered_latest.RDS")

subset <- denovo_AML_data@assays$RNA@data[,c(rep(F,69),T)] %>%
  as_tibble() %>%
  pivot_longer(
    cols=everything(),
    names_to="cell",
    values_to="value"
  ) %>%
  left_join(
    as_tibble(denovo_AML_data@meta.data, rownames="cell") %>% 
      dplyr::select(cell,orig.ident)
  ) 

subset %>%
  ggplot(aes(x=value, colour=orig.ident, group=cell)) +
  geom_density(linewidth=0.25)+
  coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
  scale_colour_brewer(palette = "Set1")

ggsave(paste0("Post norm check all samples together "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)

subset %>%
  ggplot(aes(x=value, colour=orig.ident, group=cell)) +
  geom_density(linewidth=0.25)+
  coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
  scale_colour_brewer(palette = "Set1") +
  facet_wrap(vars(orig.ident))

ggsave(paste0("Post norm check 4 samples separate "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

#denovo_AML_data <- readRDS("denovo_AML_data_norm_filtered_latest.RDS")
#qc_metrics_filtered <- readRDS("denovoAML_qc_metrics_filtered_latest.RDS")
FindVariableFeatures(
  denovo_AML_data,
  selection.method = "vst",
  nfeatures = 500
) -> denovo_AML_data

as_tibble(HVFInfo(denovo_AML_data),rownames = "Gene") -> variance.data

variance.data %>% 
  mutate(hypervariable=Gene %in% VariableFeatures(denovo_AML_data)
) -> variance.data

variance.data %>%
  arrange(desc(variance.standardized)) -> variance.data

head(variance.data,n=100)

Plot variable genes

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

variance.data %>%
  arrange(hypervariable) %>%
  ggplot(aes(x=mean,y=variance, colour=hypervariable)) +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  scale_colour_manual(values=c("grey","red2"))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

ggsave(paste0("HVG gene plot "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous x-axis
#Extract gene list for GO analysis
#https://www.bioinformatics.babraham.ac.uk/goliath/goliath.cgi

variance.data %>%
  arrange(desc(variance.standardized)) %>%
  slice(1:100) %>%
  pull(Gene) %>%
  sapply(function(x)cat(paste0(x,"\n"))) -> temp
## JCHAIN
## IGKC
## IGHA1
## IGHG1
## GNLY
## IGLC2
## MZB1
## PRTN3
## CTSG
## TPSB2
## ELANE
## MPO
## LSAMP
## C1QB
## PTGDS
## TXNDC5
## IL32
## IGHA2
## IFI27
## IGHV3-30
## AZU1
## IGLL1
## IGHG2
## IGLC3
## FCER1A
## C1QC
## IGHV3-33
## NRXN3
## TPSD1
## FN1
## IGLV1-40
## IGHGP
## LTBP1
## BGLAP
## CCL4
## IGKV3-20
## IGLV2-14
## GZMB
## AC092691.1
## PRF1
## IGHG3
## IGLV6-57
## ACSM3
## NKG7
## IGKV1-5
## C1QA
## S100B
## AREG
## IL1R2
## TYMS
## G0S2
## IGHV1-18
## IGHV3-7
## IGLC1
## CD1C
## RRM2
## AQP3
## IGHV3-23
## CCL5
## THBS1
## GZMA
## CST7
## MT1G
## TRBV7-9
## CD27
## GZMH
## TRBV4-2
## MS4A1
## IGHG4
## IGHV4-39
## S100A8
## IGFBP2
## S100A9
## UBE2C
## ZNF385D
## CCDC26
## MS4A3
## AC109466.1
## GZMK
## TRBV9
## CCL4L2
## EREG
## AC233755.1
## IGKV4-1
## DPP10
## DERL3
## TRBV5-1
## MT1H
## HLA-DQA1
## LINC01478
## IGHV3-64D
## LINC02147
## IGHV4-31
## IGHV6-1
## SELENOP
## TNFRSF17
## IGHV3-48
## RHOXF1-AS1
## IGHV4-59
## IGHV3-66

Save variable gene and generated gene list

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

saveRDS(variance.data, file="variable_genes_latest.RDS")
saveRDS(temp, file="variable_genes_list_for_GO_latest.RDS")

A bit of data exploration:

qc_metrics_filtered %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=value)) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

Dim red with PCA using var genes

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

ScaleData(denovo_AML_data, features=rownames(data)) -> denovo_AML_data
## Centering and scaling data matrix
RunPCA(
  denovo_AML_data,
  features=VariableFeatures(denovo_AML_data)
) -> denovo_AML_data
## PC_ 1 
## Positive:  IFITM1, LY6E, ISG15, IFI6, MDK, AL589693.1, IGHM, TRH, BAALC, INPP4B 
##     ISG20, CD69, TOX, SOX4, NPW, KCNQ5, H1F0, PRSS57, ANGPT1, APP 
##     CD34, DNTT, IFITM3, CASC15, HLA-DQA1, STMN1, C1QTNF4, CALCRL, HLA-DPB1, HLA-DPA1 
## Negative:  LYZ, VCAN, S100A9, CFD, S100A8, IFI30, CST3, SERPINA1, MS4A6A, PSAP 
##     AL163541.1, CD14, FCN1, SLC8A1, CD36, LMNA, CTSD, CD163, NCF1, NEAT1 
##     MAFB, AC020656.1, ACSL1, CPVL, CRIP1, ARHGAP24, HMOX1, C1QA, FNDC3B, F13A1 
## PC_ 2 
## Positive:  TYMS, PCLAF, AZU1, CTSG, ELANE, IGFBP2, CENPM, UHRF1, RRM2, PLPPR3 
##     CLSPN, MCM7, TK1, ATP8B4, PCNA, MIR924HG, MKI67, H2AFZ, MYBL2, DIAPH3 
##     HIST1H4C, STMN1, MCM4, MS4A3, ST8SIA6, PRTN3, HIST1H3B, MPO, TUBB, MEIS1 
## Negative:  LILRA4, DRD4, LINC01478, SPIB, HLA-DPA1, NIBAN3, LINC01374, UGCG, ZFP36L1, HLA-DQA1 
##     PLD4, HLA-DPB1, HLA-DRA, EPHB1, CLIC3, VSIG4, PTPRS, CLEC10A, TSPAN13, FCN1 
##     ZFAT, CD300E, C1QA, CD14, NAPSA, HLA-DQB1, HLA-DRB1, FCGR3A, C1QB, ISG20 
## PC_ 3 
## Positive:  IL32, PRF1, CST7, GZMA, GZMH, CD3E, GZMB, CD247, GZMM, LCK 
##     SKAP1, KLRD1, SYNE2, FGFBP2, CD3D, GNLY, CCL5, NKG7, CD8A, TRBC1 
##     TGFBR3, CD3G, TRBC2, BCL11B, ARL4C, THEMIS, CD8B, CCL4, CAMK4, KLRC2 
## Negative:  PRSS57, MDK, ATP8B4, TRH, ANGPT1, AL589693.1, BAALC, NPW, CD34, KCNQ5 
##     CASC15, CALCRL, HLA-DRA, SOX4, HSPB1, INPP4B, SLC24A3, TPSAB1, DNTT, H1F0 
##     C1QTNF4, HLA-DRB1, LINC02147, IFITM3, HLA-DPB1, SPINK2, MAMDC2, IGHM, HLA-DQB1, TOX 
## PC_ 4 
## Positive:  LILRA4, LINC01478, UGCG, DRD4, NIBAN3, PLD4, PTPRS, ZFAT, SPIB, EPHB1 
##     LINC01374, DERL3, JCHAIN, MZB1, TCF4, CLIC3, IRF4, TSPAN13, IGKC, NAPSA 
##     TXNDC5, RUNX2, APP, HSP90B1, AFF3, HDAC9, RASD1, MYBL2, SEC11C, HSPA5 
## Negative:  TRH, MDK, BAALC, AL589693.1, INPP4B, ANGPT1, CD34, CD69, NPW, DNTT 
##     CCL5, TOX, CALCRL, IL32, MT2A, CASC15, C1QTNF4, IFITM3, IFI6, PRF1 
##     GZMA, NEAT1, GZMH, PRSS57, CST7, GZMM, ZNF385D, CD3E, HSPB1, ZFP36L1 
## PC_ 5 
## Positive:  HLA-DRB1, HLA-DRA, HLA-DPA1, HLA-DPB1, HLA-DQB1, TUBA1B, HLA-DQA1, H2AFZ, TUBB, IFITM3 
##     CST3, IFI30, CLEC10A, TYMS, IFI6, HSP90B1, TK1, ISG15, CRIP1, C1QA 
##     LY6E, HDAC9, PCLAF, FCER1A, PCNA, PSAP, PKIB, MT2A, NAPSA, RRM2 
## Negative:  ZNF521, LTBP1, ACSM3, SELENOP, CPA3, PLPPR3, DPP10, IGFBP2, LSAMP, IGLL1 
##     NCAM1, MPO, AC092691.1, MSI2, CNTN1, NRXN3, CAV2, SCN9A, MS4A3, AC244502.1 
##     CTSG, ELANE, PRTN3, TPSB2, AC109466.1, LINC02147, PTGER3, AZU1, EREG, TPSD1
PCAPlot(denovo_AML_data, dims=c(1,2), group.by= 'orig.ident')

ggsave(paste0("PCA dim 1&2 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(denovo_AML_data, dims=c(3,4), group.by= 'orig.ident')

ggsave(paste0("PCA dim 3&4 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(denovo_AML_data, dims=c(5,6), group.by= 'orig.ident')

ggsave(paste0("PCA dim 5&6 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(denovo_AML_data, dims=c(7,8), group.by= 'orig.ident')

ggsave(paste0("PCA dim 7&8 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(denovo_AML_data, dims=c(9,10), group.by= 'orig.ident')

ggsave(paste0("PCA dim 9&10 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


ElbowPlot(denovo_AML_data, ndims=40)

ggsave(paste0("PCA SD vs dim "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
Idents(denovo_AML_data) <- 'orig.ident'
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

saved.seed <- 5540
set.seed(saved.seed)

RunUMAP(
  denovo_AML_data,
  dims=1:30,
  n.neighbors = 50,
  min.dist = 1,
  seed.use = saved.seed
) -> denovo_AML_data
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:54:36 UMAP embedding parameters a = 0.115 b = 1.929
## 11:54:36 Read 12599 rows and found 30 numeric columns
## 11:54:36 Using Annoy for neighbor search, n_neighbors = 50
## 11:54:36 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:54:38 Writing NN index file to temp file /var/folders/rx/vvf3pzf55v912fz97g52nz3m0000gn/T//RtmpmzbxDj/file84b5119e1f81
## 11:54:38 Searching Annoy index using 1 thread, search_k = 5000
## 11:54:45 Annoy recall = 100%
## 11:54:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 50
## 11:54:48 Initializing from normalized Laplacian + noise (using irlba)
## 11:54:49 Commencing optimization for 200 epochs, with 865768 positive edges
## 11:55:03 Optimization finished
DimPlot(denovo_AML_data,reduction = "umap", pt.size = 0.3)

ggsave(paste0("UMAP all samples "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 8, dpi = 300)


DimPlot(denovo_AML_data,reduction = "umap", pt.size = 0.3, split.by = "orig.ident", ncol = 2)

ggsave(paste0("UMAP 3 samples separate "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 12, height = 10, dpi = 300)

As AML3 seems completely off, we are removing it from the dataset for further analysis

(Below is trying to do batch correction but harmony somehow cannot be downloaded)

denovo_AML_data_no3 <- subset(denovo_AML_data, subset = orig.ident != "AML_3")
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

DimPlot(denovo_AML_data_no3,reduction = "umap", pt.size = 0.3)

ggsave(paste0("UMAP 2 samples "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 8, dpi = 300)


DimPlot(denovo_AML_data_no3,reduction = "umap", pt.size = 0.3, split.by = "orig.ident", ncol = 2)

ggsave(paste0("UMAP 2 samples separate "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 12, height = 10, dpi = 300)

Generating SNN Clusters

FindNeighbors(denovo_AML_data_no3, reduction = "pca", dims = 1:30) -> denovo_AML_data_no3
## Computing nearest neighbor graph
## Computing SNN
denovo_AML_data_no3@graphs$RNA_snn[1:10,1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 10 column names 'AAACGGGCACCAGTTA-1_1', 'AAACGGGGTACCGGCT-1_1', 'AAAGATGAGCCGTCGT-1_1' ... ]]
##                                                           
## AAACGGGCACCAGTTA-1_1 1 . . . . . . .          . .         
## AAACGGGGTACCGGCT-1_1 . 1 . . . . . .          . .         
## AAAGATGAGCCGTCGT-1_1 . . 1 . . . . .          . .         
## AAAGATGCACTTCGAA-1_1 . . . 1 . . . .          . .         
## AAAGATGTCCCAAGAT-1_1 . . . . 1 . . .          . .         
## AAAGCAACACAGACAG-1_1 . . . . . 1 . .          . .         
## AAAGCAACATAAAGGT-1_1 . . . . . . 1 .          . .         
## AAAGCAACATCCTTGC-1_1 . . . . . . . 1.00000000 . 0.08108108
## AAAGCAATCTACCTGC-1_1 . . . . . . . .          1 .         
## AAAGTAGAGGATGCGT-1_1 . . . . . . . 0.08108108 . 1.00000000
FindClusters(denovo_AML_data_no3, resolution = 0.1) -> denovo_AML_data_no3
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6597
## Number of edges: 230574
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9291
## Number of communities: 6
## Elapsed time: 1 seconds
head(denovo_AML_data_no3$seurat_clusters, n=50)
## AAACGGGCACCAGTTA-1_1 AAACGGGGTACCGGCT-1_1 AAAGATGAGCCGTCGT-1_1 
##                    0                    0                    1 
## AAAGATGCACTTCGAA-1_1 AAAGATGTCCCAAGAT-1_1 AAAGCAACACAGACAG-1_1 
##                    0                    0                    2 
## AAAGCAACATAAAGGT-1_1 AAAGCAACATCCTTGC-1_1 AAAGCAATCTACCTGC-1_1 
##                    0                    0                    2 
## AAAGTAGAGGATGCGT-1_1 AAAGTAGTCCAAATGC-1_1 AAAGTAGTCGTAGGAG-1_1 
##                    0                    0                    1 
## AAATGCCAGGACCACA-1_1 AAATGCCAGTTAAGTG-1_1 AAATGCCCATTTGCCC-1_1 
##                    0                    0                    0 
## AACACGTAGAACAATC-1_1 AACACGTAGGACGAAA-1_1 AACCATGTCTTGTCAT-1_1 
##                    0                    0                    1 
## AACCGCGAGAAGAAGC-1_1 AACCGCGCAAAGTCAA-1_1 AACCGCGCAGCTCGCA-1_1 
##                    0                    0                    0 
## AACCGCGCATCTCGCT-1_1 AACCGCGGTTCTGTTT-1_1 AACCGCGTCCAGATCA-1_1 
##                    0                    3                    0 
## AACGTTGAGGATATAC-1_1 AACGTTGCACTAAGTC-1_1 AACGTTGCAGCGATCC-1_1 
##                    0                    0                    0 
## AACGTTGGTCAAAGCG-1_1 AACGTTGGTCCCTACT-1_1 AACGTTGGTTCCCTTG-1_1 
##                    0                    0                    0 
## AACTCCCAGTGTACGG-1_1 AACTCCCCACCACGTG-1_1 AACTCTTCACGAGGTA-1_1 
##                    1                    2                    0 
## AACTCTTGTGCAACGA-1_1 AACTCTTGTTCTGTTT-1_1 AACTCTTTCTCGGACG-1_1 
##                    0                    0                    0 
## AACTGGTAGTGCGTGA-1_1 AACTGGTCAATCCGAT-1_1 AACTGGTGTGTAATGA-1_1 
##                    0                    4                    0 
## AACTTTCAGATGTAAC-1_1 AACTTTCGTGAAATCA-1_1 AAGACCTAGCGTCTAT-1_1 
##                    1                    3                    1 
## AAGGAGCCAAAGGTGC-1_1 AAGGCAGCAGCCTATA-1_1 AAGTCTGTCAGTTCGA-1_1 
##                    2                    0                    3 
## AATCCAGCACCGCTAG-1_1 AATCCAGTCCGAAGAG-1_1 AATCGGTCACCGTTGG-1_1 
##                    0                    0                    0 
## ACACCAACATGCTAGT-1_1 ACACCAAGTCTCTTAT-1_1 
##                    0                    1 
## Levels: 0 1 2 3 4 5

Plot clusters on UMAP

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon 2024")

DimPlot(denovo_AML_data_no3,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 8)

ggsave(paste0("UMAP 2 samples with SNN clusters resolution 0.1 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 8, dpi = 300)

DimPlot(denovo_AML_data_no3,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 4, split.by = "orig.ident", ncol = 2) 

ggsave(paste0("UMAP 2 samples separate with SNN clusters resolution 0.1 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 12, height = 10, dpi = 300)